Getting ready

library(knitr)
library(lubridate)
library(readr)
library(dplyr)
library(ggplot2)
library(plotly)
library(DT)
library(highcharter)
library(ggmap)
# when I downloaded the data from Kaggle the file name was "other-Lyft_B02510.csv" 
# I changed it to "Lyft.csv"

lyft <- read_csv('Lyft.csv')

colnames(lyft) <- c('time_of_trip', 'lat', 'lon', 'X4')

lyft <- lyft %>% 
  select(time_of_trip, lat, lon) %>%
  mutate(time_of_trip = mdy_hm(time_of_trip) ) %>%
  mutate(Day = day(time_of_trip),
         Month = month(time_of_trip, label = TRUE, abbr = FALSE),
         Year = year(time_of_trip),
         Hour  = factor(hour(time_of_trip)),
         Weekday = wday(time_of_trip, label = TRUE, abbr = FALSE) )

nyc <- c(lon = mean(lyft$lon, na.rm = TRUE), lat = mean(lyft$lat, na.rm = TRUE) )
nyc_map <- get_map(location = nyc, zoom = 13)

Introduction

This data set contains information about Lyft’s pickups in New York City from July 25, 2014 to September 30, 2014. There were a total of 267,701 pickups (rows) in 68 days. Source: Kaggle.

n_month <- lyft %>% 
  group_by(Month, Year) %>%
  summarise(Pickups = n(), first_day = min(Day), last_day = max(Day))  %>% 
  select(Year, Month, first_day, last_day, Pickups)

n_month$Year <- as.character(n_month$Year)
n_month$first_day <- as.character(n_month$first_day)
n_month$last_day <- as.character(n_month$last_day)
n_month$Pickups <- prettyNum(n_month$Pickups, big.mark = ',')

colnames(n_month)[3:4] <- c('Firsr Day', 'Last Day')

kable(n_month)
Year Month Firsr Day Last Day Pickups
2014 July 25 31 4,254
2014 August 1 31 147,448
2014 September 1 30 115,999



Pickups per Hour

n_hour <- lyft %>%
  group_by(Hour) %>%
  summarise(Pickups = n() ) 

ggplotly(ggplot(n_hour, aes(Hour, Pickups) ) + 
  geom_bar(stat = 'identity', fill = 'deeppink3') +
  ggtitle("Number of Pickups per Hour") ) 

Lyft Pickups in NYC

This map displays all the Lyft pickups from July 25, 2014 to September 30, 2014. The highest demand for Lyft pickups occur during 9pm - 12am and 12am - 3am. It was intresting to find out that the least amount of pickups occur during 9am - 12pm!

D. Kahle and H. Wickham. ggmap: Spatial Visualization with ggplot2. The R Journal, 5(1), 144-161.

lyft$Hour <- as.integer(lyft$Hour)

lyft <- lyft %>%
  mutate(Day_Segment = Hour ) %>%
  mutate(Day_Segment = if_else(Day_Segment <= 3, "00-03",
                               if_else(Day_Segment <= 6, "03-06", 
                                       if_else(Day_Segment <= 9, "06-09",
                                               if_else(Day_Segment <= 12, "09-12",
                                                       if_else(Day_Segment <= 15, "12-15",
                                                               if_else(Day_Segment <= 18,                                                                                         "15-18",
                                                                       if_else(Day_Segment <=                                                                     21, "18-21", "21-24"))))))))

12:00 AM - 3:00 AM

lyft1 <- lyft %>% 
  filter(Day_Segment == '00-03' )

ggmap(nyc_map, base_layer = ggplot(lyft1, aes(lon, lat) ), extent = 'device', legend = 'none')+
 geom_point(alpha = 1/20, size = 0.8, colour = 'deeppink3')

3:00 AM - 6:00 AM

lyft2 <- lyft %>%
  filter(Day_Segment == '03-06' )

ggmap(nyc_map, base_layer = ggplot(lyft2, aes(lon, lat) ), extent = 'device', legend = 'none')+
 geom_point(alpha = 1/20, size = 0.8, colour = 'deeppink3')

6:00 AM - 9:00 AM

lyft3 <- lyft %>%
  filter(Day_Segment == '06-09' )

ggmap(nyc_map, base_layer = ggplot(lyft3, aes(lon, lat) ), extent = 'device', legend = 'none')+
 geom_point(alpha = 1/20, size = 0.8, colour = 'deeppink3')

9:00 AM - 12:00 PM

lyft4 <- lyft %>%
  filter(Day_Segment == '09-12' )

ggmap(nyc_map, base_layer = ggplot(lyft4, aes(lon, lat) ), extent = 'device', legend = 'none')+
 geom_point(alpha = 1/20, size = 0.8, colour = 'deeppink3')

12:00 PM - 3:00 PM

lyft5 <- lyft %>%
  filter(Day_Segment == '12-15' )

ggmap(nyc_map, base_layer = ggplot(lyft5, aes(lon, lat) ), extent = 'device', legend = 'none')+
 geom_point(alpha = 1/20, size = 0.8, colour = 'deeppink3')

3:00 PM - 6:00 PM

lyft6 <- lyft %>%
  filter(Day_Segment == '15-18' )

ggmap(nyc_map, base_layer = ggplot(lyft6, aes(lon, lat) ), extent = 'device', legend = 'none')+
 geom_point(alpha = 1/20, size = 0.8, colour = 'deeppink3')

6:00 PM - 9:00 PM

lyft7 <- lyft %>%
  filter(Day_Segment == '18-21' )

ggmap(nyc_map, base_layer = ggplot(lyft7, aes(lon, lat) ), extent = 'device', legend = 'none')+
 geom_point(alpha = 1/20, size = 0.8, colour = 'deeppink3')

9:00 PM - 12:00 AM

lyft8 <- lyft %>%
  filter(Day_Segment == '21-24' )

ggmap(nyc_map, base_layer = ggplot(lyft8, aes(lon, lat) ), extent = 'device', legend = 'none')+
 geom_point(alpha = 1/20, size = 0.8, colour = 'deeppink3')



Pickups per Weekday

There is an increase in demand on the weekends.

Wdays <- lyft %>%
  group_by(Weekday) %>%
  summarise(Pickups = n() )

ggplotly(ggplot(Wdays, aes(Weekday, Pickups ) ) +
  geom_bar(stat = 'identity', fill = 'deeppink3' ) +
  ggtitle('Number of Pickups per Weekday'))
daysOfWeek <- lyft %>% 
  group_by(Day, Month, Weekday) %>%
  summarise(Pickups = n() ) 

daysOfWeek1 <- lyft %>% 
  filter(Month != 'July') %>%
  group_by(Day, Month, Weekday) %>%
  summarise(Pickups = n() ) %>%
  ungroup()

avg_pickups <- daysOfWeek1 %>%
  group_by(Weekday) %>%
  summarise(avg_pickups = mean(Pickups) )

weekends <- daysOfWeek1 %>%
  filter(  Weekday == 'Saturday' | Weekday == 'Sunday')
wdays <- daysOfWeek1 %>%
  filter(Weekday == 'Monday' | Weekday == 'Tuesday' | Weekday == 'Wednesday' | Weekday == 'Thursday' | Weekday == 'Friday' )

weekends <- weekends %>%
  summarise(Weekday = 'Weekends (Sat & Sun)'  , avg_pickups = mean(Pickups) )

wdays <- wdays %>%
  summarise(Weekday = 'Weekdays (Mon-Fri)'  , avg_pickups = mean(Pickups) )

avg_pickups <- rbind(avg_pickups, weekends)
avg_pickups <- rbind(avg_pickups, wdays)

kable(avg_pickups, caption = 'Average number of pickups per Weekday' )
Average number of pickups per Weekday
Weekday avg_pickups
Sunday 5046.667
Monday 3280.778
Tuesday 3321.667
Wednesday 3712.625
Thursday 4420.125
Friday 4756.333
Saturday 5637.333
Weekends (Sat & Sun) 5342.000
Weekdays (Mon-Fri) 3890.488

Demand increases 37% on the weekends.

Note: the observations from the last seven days of July have been excluded from the calculation of the mean values shown in this table because there are so few pickups in July compared to August and September. In the next two plots, notice how the distribution of the number of pickups per Weekday changes when the last seven days of July are included.

Distribution of Pickups per Weekday

Look for the changes in the minimum number of pickups per Weekday.

August and September

ggplotly(ggplot(daysOfWeek1, aes(Weekday, Pickups, fill = Weekday ) ) +
  geom_boxplot() +
  #scale_y_log10() +
  ggtitle("Distribution of Pickups per Weekday") +
  #theme(legend.position = "top") +
  coord_flip())

Including the last 7 days of July

ggplotly(ggplot(daysOfWeek, aes(Weekday, Pickups, fill = Weekday ) ) +
  geom_boxplot() +
  #scale_y_log10() +
  ggtitle("Distribution of Pickups per Weekday") +
  #theme(legend.position = "top") +
  coord_flip())

Pickups per Day of the Month

In this table we have the number of pickups for each of the 68 days that are recorded in this dataset.

datatable(daysOfWeek, rownames = FALSE)



The last seven days of July had the least number of pickups.

Number of Pickups per Day

August and September

lyft$Day <- as.factor(lyft$Day)

days <- lyft %>%
  filter(Month != 'July') %>%
  group_by(Day, Month) %>%
  summarise(Pickups = n() )

ggplotly(ggplot(days, aes(Day, Pickups, fill = Month ) ) +
  geom_bar(stat = 'identity' ) + 
  ggtitle('Number of Pickups per Day') ) 

August

aug <- lyft %>%
  filter(Month == 'August') %>%
  group_by(Day) %>%
  summarise(Pickups = n() )

ggplotly(ggplot(aug, aes(Day, Pickups) ) +
  geom_bar(stat = 'identity', fill = 'deeppink3' ) +
  ggtitle("Number of Pickups per Day, August"))

September

sep <- lyft %>% 
  filter(Month == 'September') %>%
  group_by(Day) %>%
  summarise(Pickups = n() )

ggplotly(ggplot(sep, aes(Day, Pickups) ) +
  geom_bar(stat = 'identity', fill = 'deeppink3' ) +
  ggtitle('Number of Pickups per Day, September'))

Including the last 7 days of July

days <- lyft %>% 
  group_by(Day, Month) %>%
  summarise(Pickups = n() )

ggplotly(ggplot(days, aes(Day, Pickups, fill = Month) ) +
  geom_bar(stat = 'identity') +
  ggtitle('Number of Pickups per Day'))

Another picture

In this plot, “day 1” is July 25th and “day 68” is September 30th.

days <- days %>%
  ungroup() %>% 
  arrange(Month) 

days <- days %>%
  mutate(day_n = rownames(days) ) 
days$day_n <- as.integer(days$day_n)

ggplotly( ggplot(days, aes(day_n, Pickups, color = Month ) ) + 
  geom_point() +
  ggtitle('Number of Pickups per Day'))



Pickups growth by month: August VS September

growth <- lyft %>%
  filter(Month != 'July') %>%
  group_by(Day, Month) %>%
  summarise(Pickups = n() ) %>%
  ungroup() %>%
  group_by(Month) %>%
  mutate(cum_pickups = cumsum(Pickups) ) %>%
  ungroup()

ggplotly(ggplot(growth, aes(Day, cum_pickups, color = Month) ) + 
  geom_point() +
  ggtitle('Cumulative Sum of Pickups per Day of the Month'))



Summary



Take a look at the original data set

Showing the first 20 rows

# when I downloaded the data from Kaggle the file name was "other-Lyft_B02510.csv" 
# I changed it to "Lyft.csv"

lyft <- read_csv('Lyft.csv')

datatable(head(lyft, n= 20 ))



Contact Me

Email: javier.zuniga1912@gmail.com

LinkedIn